********************************************************************************
* matching_firm_eligible.do
* Purpose: Construct the ELIGIBLE FIRM SAMPLE for balance-table comparisons.
*
* The "eligible" sample includes ALL treated firms that satisfied selection
* criteria (size, non-missing covariates), regardless of whether they were
* successfully matched. Control firms included are only those that were
* actually matched. This broader sample is used in Table A8/A10 to assess
* pre-matching balance and to report means for unmatched firms.
*
* Key differences from matching_firm.do:
*   - The eligible list (firm_eligible_list.dta) retains ALL treated firms
*     (both matched and unmatched), plus matched control firms.
*   - A "matched" indicator distinguishes matched from unmatched eligible firms.
*   - Sector-level pair variables fall back to the firm's own sector when
*     no match exists (for unmatched treated firms).
*
* Output: $data/firm_eligible.dta        (event-study panel, eligible sample)
*         $data/firm_eligible_list.dta   (IDs of eligible firms)
********************************************************************************
* First and last pre-event years (year_prior) for which matching is run.
global start_year 	= 2004
global end_year 	= 2016

**# matching
* For each pre-event year y: build the eligible sample, define treatment as
* having a deal in y+1, and run 1:1 nearest-neighbour matching without
* replacement inside exact cells. One file per year is saved, containing ALL
* eligible firms of that year (matched and unmatched).
forvalues y = $start_year/$end_year {

	* `clear' is the documented option for -use- (and matches the panel step below)
	use $data/firm_`y', clear

	** drop firms that have less than 10 employees in year y **
	keep if PD7_AvgEmp_NonZero >= 10

	** drop firms with missing values for the match vars **
	drop if mi(naics) | mi(total_revenue) | mi(age) | mi(avg_wage)

	** Define treated for firms that go through an M&A event the following year **
	gen treated		= (DEAL_YEAR == `y' + 1)

	** drop other M&A firms that are treated in other years **
	drop if MnA_firm == 1 & treated == 0
	gsort -treated

	* create age, average payrolls, and revenue bins:
	* 10 quantile bins for avg_wage, 15 for every other variable.
	* The comparison must be on the variable NAME (string comparison). An
	* unquoted `var' == avg_wage would instead compare the two variables'
	* values in the first observation.
	foreach var in avg_wage total_revenue age {
		local nbins = cond("`var'" == "avg_wage", 10, 15)
		gquantiles 	bin_`var' = `var', xtile n(`nbins')
	}

	* create interactions between bins
	* firms must be matched within the same cells
	gegen cell = group(naics2 OPAddressProvince bin_*)
	drop if cell == .

	* estimate pscore (linear probability model) using the eligible sample
	reg treated c.total_revenue##c.total_revenue c.age##c.age c.avg_wage##c.avg_wage

	* Store the score in double precision: it is shifted by 10*cell below, and
	* in float storage a large shift rounds away the within-cell differences
	* that the nearest-neighbour step relies on.
	predict double pscore

	gen double pscore_og = pscore
	* Shift the score by the cell id so that firms in different cells are far
	* apart on the matching variable.
	* NOTE(review): together with caliper(1) this presumably confines matches
	* to the same cell, provided fitted values span less than the gap between
	* cells -- confirm.
	replace pscore = pscore + 10.0*cell

	** placeholder outcome required for the psmatch2 call below; it is random
	** and no seed is set in this file, so its values differ across runs
	gen outcome = rnormal()

	*** Fix the sorting of data (deterministic order, for replication)
	gsort entid_syn

	* matching
	psmatch2 treated, outcome(outcome) pscore(pscore) caliper(1) n(1) noreplacement

	* Pair identifier: a matched control carries its own _id, a matched treated
	* firm carries the _id of its neighbour (_n1). Unmatched firms stay missing.
	* `long' keeps large observation ids exact.
	gen long id = _id if treated==0 & _weight==1
	replace	 id = _n1 if treated==1 & _weight==1

	gegen pairid = group(id)
	gsort pairid

	gen year_prior = `y'

	save $data/firm_eligible_`y'.dta,  replace
}

drop _all
**# Eligible Firms List
* Stack the yearly match files, keeping only the identifiers needed to
* rebuild the panel.
local list_vars entid_syn pairid treated year_prior
forvalues y = $start_year/$end_year {
	append using $data/firm_eligible_`y', keep(`list_vars') force
}

* Retain every treated firm plus the control firms that received a pair id;
* unmatched controls are removed.
keep if treated != 0 | !missing(pairid)
save $data/firm_eligible_list.dta,  replace

**# Eligible Firm Panel
* For every calendar year, keep only firms on the eligible list. The merge is
* 1:m, so a firm that appears on the list under several year_prior values
* gets one row per list entry.
forvalues y = 2001/2017{

	use $data/firm_`y', clear

	* keep all eligible treated firms and matched control firms
	merge 1:m 	entid_syn using $data/firm_eligible_list, keep(3) nogen

	save $data/firm_eligible_panel_`y', replace

}

* Stack the yearly pieces into one panel.
drop _all
forvalues y = 2001/2017{
	append using $data/firm_eligible_panel_`y', force
}

save $data/firm_eligible_intermid, replace

* Remove the yearly pieces only after the stacked file has been written, so a
* failure during the append leaves them in place for a re-run.
* -erase- is used because the -rm- alias is not available on Windows.
forvalues y = 2001/2017{
	erase $data/firm_eligible_panel_`y'.dta
}

gsort pairid year_prior -treated year
**# Eligible Treated Firm and Matched Control Firm Panel
* Build the firm and firm-by-event identifiers, restrict the deal variables
* to the pre-event year, and rebuild the 2-digit sector code.
* (Financial sectors are dropped further down in this file.)
gegen 	firm_id = group(entid_syn)
gegen 	id      = group(firm_id year_prior)

* deal information is kept only on the year_prior row
foreach v in DEAL Acquirer DEAL_YEAR {
	replace `v' = . if year != year_prior
}

* 1 when the firm received a pair id in the matching step, 0 otherwise
gen matched = !missing(pairid)

** sector -- fill missing sector codes with the firm's modal code
** (minmode: ties resolved towards the smallest mode)
drop naics2
gegen	naics_mode = mode(naics), by(entid_syn) minmode
replace naics      = naics_mode if missing(naics)
gen 	naics2     = int(naics/100)

* collapse sector codes that are grouped together
recode naics2 (32 33 = 31) (45 = 44) (49 = 48) (56 61 62 = 54)

* the sector prior to the event
gen 	naics2_event = naics2 if year == year_prior

**# Time Variables
* Event time relative to the deal year (year_prior + 1), with everything
* beyond +/-5 pooled into the end bins -6 and 6.
gen 	t = year - (year_prior + 1)
replace t = 6 	if t >  5 	& ~mi(t)
replace t = -6 	if t < -5 	& ~mi(t)

* One dummy per observed value of t: ds_1 is the smallest value, and so on.
tab t, gen(ds_)
levelsof t, local(ts)

* Label each dummy with the event time it actually represents and zero out
* the dummy for t = -1 (the omitted reference period). Walking the observed
* levels, rather than assuming all 13 values -6..6 are present, keeps labels
* and the reference period correct if some event time is absent.
* With all 13 levels present, the reference dummy is ds_6.
local i = 0
foreach lev of local ts {
	local ++i
	label variable ds_`i' "`lev'"
	if `lev' == -1 {
		replace ds_`i' = 0
	}
}

* firm age - rebuilt from the birth date recorded in the pre-event year, so
* that age is defined in every panel year of the same firm-event (id)
drop 	dateinc dateinc_year age
gen 	temp_dateinc = dofc(BirthDate) if year == year_prior
gegen	dateinc      = firstnm(temp_dateinc), by(id)
gen 	dateinc_year = yofd(dateinc)
gen 	age          = year - dateinc_year

* polynomial terms in age: age2, age3, age4
forvalues p = 2/4 {
	gen age`p' = age^`p'
}

* acquirer and merger flags are kept only for treated firms, and only on the
* pre-event-year row
foreach v in Acquirer merger {
	replace `v' = . if treated != 1 | year != year_prior
}

* pair-level variables
* Sort so the treated firm comes first within each (pairid, year_prior) group.
gsort pairid year_prior -treated year

* Acquirer flag shared within the pair. Firms without a pair id take the
* value from their own rows instead.
gegen matched_acq_tmp = firstnm(Acquirer), by(entid_syn)
gegen matched_acq     = firstnm(Acquirer), by(pairid year_prior)
replace matched_acq   = matched_acq_tmp if !matched

* Deal type (merger flag), built the same way.
gegen matched_deal_type_tmp = firstnm(merger), by(entid_syn)
gegen matched_deal_type     = firstnm(merger), by(pairid year_prior)
replace matched_deal_type   = matched_deal_type_tmp if !matched

* Pre-event sector, taken from the firm's own rows.
* NOTE(review): grouping is by entid_syn, not by pair or by id, so a firm that
* enters under several year_prior values gets a single sector for all of them
* -- confirm this is intended.
gegen matched_sector = firstnm(naics2_event), by(entid_syn)

* drop financial sectors (codes 52 and 55)
drop if inlist(matched_sector, 52, 55)


compress
save $data/firm_eligible, replace